# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import os
import plotly.offline as pyo
pyo.init_notebook_mode()
# Getting the path of the folder
pwd = os.getcwd()
# Making a list of the files names
files = ['\Sales_January_2019.csv','\Sales_February_2019.csv','\Sales_March_2019.csv','\Sales_April_2019.csv','\Sales_May_2019.csv','\Sales_June_2019.csv','\Sales_July_2019.csv','\Sales_August_2019.csv',
'\Sales_September_2019.csv', '\Sales_October_2019.csv', '\Sales_November_2019.csv','\Sales_December_2019.csv']
# create empty list to store the data in
Each_month = []
# append datasets into the list
for i in range(len(files)):
Each_month.append(pd.read_csv(pwd+"\Data"+files[i]))
Each_month[0]
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1 | 700 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 |
| 1 | 141235 | Lightning Charging Cable | 1 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 |
| 2 | 141236 | Wired Headphones | 2 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 |
| 3 | 141237 | 27in FHD Monitor | 1 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 |
| 4 | 141238 | Wired Headphones | 1 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 |
| ... | ... | ... | ... | ... | ... | ... |
| 9718 | 150497 | 20in Monitor | 1 | 109.99 | 01/26/19 19:09 | 95 8th St, Dallas, TX 75001 |
| 9719 | 150498 | 27in FHD Monitor | 1 | 149.99 | 01/10/19 22:58 | 403 7th St, San Francisco, CA 94016 |
| 9720 | 150499 | ThinkPad Laptop | 1 | 999.99 | 01/21/19 14:31 | 214 Main St, Portland, OR 97035 |
| 9721 | 150500 | AAA Batteries (4-pack) | 2 | 2.99 | 01/15/19 14:21 | 810 2nd St, Los Angeles, CA 90001 |
| 9722 | 150501 | Google Phone | 1 | 600 | 01/13/19 16:43 | 428 Cedar St, Boston, MA 02215 |
9723 rows × 6 columns
# Combining the Tables into one table
df = pd.concat([Each_month[0], Each_month[1], Each_month[2], Each_month[3], Each_month[4], Each_month[5], Each_month[6], Each_month[7], Each_month[8], Each_month[9], Each_month[10], Each_month[11]], ignore_index=True)
df
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1 | 700 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 |
| 1 | 141235 | Lightning Charging Cable | 1 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 |
| 2 | 141236 | Wired Headphones | 2 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 |
| 3 | 141237 | 27in FHD Monitor | 1 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 |
| 4 | 141238 | Wired Headphones | 1 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 |
| ... | ... | ... | ... | ... | ... | ... |
| 186845 | 319666 | Lightning Charging Cable | 1 | 14.95 | 12/11/19 20:58 | 14 Madison St, San Francisco, CA 94016 |
| 186846 | 319667 | AA Batteries (4-pack) | 2 | 3.84 | 12/01/19 12:01 | 549 Willow St, Los Angeles, CA 90001 |
| 186847 | 319668 | Vareebadd Phone | 1 | 400 | 12/09/19 06:43 | 273 Wilson St, Seattle, WA 98101 |
| 186848 | 319669 | Wired Headphones | 1 | 11.99 | 12/03/19 10:39 | 778 River St, Dallas, TX 75001 |
| 186849 | 319670 | Bose SoundSport Headphones | 1 | 99.99 | 12/21/19 21:45 | 747 Chestnut St, Los Angeles, CA 90001 |
186850 rows × 6 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 186850 entries, 0 to 186849 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 186305 non-null object 1 Product 186305 non-null object 2 Quantity Ordered 186305 non-null object 3 Price Each 186305 non-null object 4 Order Date 186305 non-null object 5 Purchase Address 186305 non-null object dtypes: object(6) memory usage: 8.6+ MB
# Checking the unique values of the product columns
# And found that the header in repeated as a record
df.Product.value_counts()
USB-C Charging Cable 21903 Lightning Charging Cable 21658 AAA Batteries (4-pack) 20641 AA Batteries (4-pack) 20577 Wired Headphones 18882 Apple Airpods Headphones 15549 Bose SoundSport Headphones 13325 27in FHD Monitor 7507 iPhone 6842 27in 4K Gaming Monitor 6230 34in Ultrawide Monitor 6181 Google Phone 5525 Flatscreen TV 4800 Macbook Pro Laptop 4724 ThinkPad Laptop 4128 20in Monitor 4101 Vareebadd Phone 2065 LG Washing Machine 666 LG Dryer 646 Product 355 Name: Product, dtype: int64
# Showing the records with the problem
df.query("Product == 'Product'")
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 1073 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 1102 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 1194 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 1897 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 2463 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| ... | ... | ... | ... | ... | ... | ... |
| 184931 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185070 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185481 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185925 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185955 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
355 rows × 6 columns
# Checking the NULLs
df.isnull().sum()
Order ID 545 Product 545 Quantity Ordered 545 Price Each 545 Order Date 545 Purchase Address 545 dtype: int64
# Checking Duplicates
pd.concat(g for _, g in df.groupby("Order ID") if len(g) > 1)
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 41 | 141275 | USB-C Charging Cable | 1 | 11.95 | 01/07/19 16:06 | 610 Walnut St, Austin, TX 73301 |
| 42 | 141275 | Wired Headphones | 1 | 11.99 | 01/07/19 16:06 | 610 Walnut St, Austin, TX 73301 |
| 57 | 141290 | Apple Airpods Headphones | 1 | 150 | 01/02/19 08:25 | 4 1st St, Los Angeles, CA 90001 |
| 58 | 141290 | AA Batteries (4-pack) | 3 | 3.84 | 01/02/19 08:25 | 4 1st St, Los Angeles, CA 90001 |
| 133 | 141365 | Vareebadd Phone | 1 | 400 | 01/10/19 11:19 | 20 Dogwood St, New York City, NY 10001 |
| ... | ... | ... | ... | ... | ... | ... |
| 184931 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185070 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185481 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185925 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
| 185955 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
15004 rows × 6 columns
# making a copy from the data
df_modified = df.copy()
df_modified = df_modified.query("Product != 'Product'")
df_modified.query("Product == 'Product'")
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
|---|
# Converting each column's type to its correct type
# but will leave order date with a string datatype to be able to convert it later
df_modified['Quantity Ordered'] = pd.to_numeric(df_modified['Quantity Ordered'])
df_modified['Price Each'] = pd.to_numeric(df_modified['Price Each'])
df_modified.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 186495 entries, 0 to 186849 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity Ordered 185950 non-null float64 3 Price Each 185950 non-null float64 4 Order Date 185950 non-null object 5 Purchase Address 185950 non-null object dtypes: float64(2), object(4) memory usage: 10.0+ MB
# Changing some columns name to be efficient in python
df_modified.rename(columns={'Order ID': 'Order_ID', 'Quantity Ordered': 'Quantity_Ordered', 'Price Each':'Price_Each', 'Order Date': 'Order_Date', 'Purchase Address': 'Purchase_Address'}, inplace=True)
df_modified.columns
Index(['Order_ID', 'Product', 'Quantity_Ordered', 'Price_Each', 'Order_Date',
'Purchase_Address'],
dtype='object')
# Dropping the NULLs
df_modified.dropna(inplace=True)
# Checking the NULLs after removing it
df_modified.isnull().sum()
Order_ID 0 Product 0 Quantity_Ordered 0 Price_Each 0 Order_Date 0 Purchase_Address 0 dtype: int64
# Extracting the date only from the Order_Date column
df_modified['Date'] = df_modified['Order_Date'].apply(lambda x : x.split()[0])
#Extracting the Day from the Date column
df_modified['Day'] = df_modified['Date'].apply(lambda x : x.split("/")[1])
df_modified.head(2)
| Order_ID | Product | Quantity_Ordered | Price_Each | Order_Date | Purchase_Address | Date | Day | |
|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1.0 | 700.00 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 | 01/22/19 | 22 |
| 1 | 141235 | Lightning Charging Cable | 1.0 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 | 01/28/19 | 28 |
# Convert the Date datatype column to a datetime
df_modified['Date'] = pd.to_datetime(df_modified['Date'])
# Extracting the month and weekday from the date
df_modified['Month'] = pd.DatetimeIndex(df_modified['Date']).month
df_modified['DayOfWeek'] = df_modified['Date'].dt.day_name()
# Checking
df_modified.info()
df_modified
<class 'pandas.core.frame.DataFrame'> Int64Index: 185950 entries, 0 to 186849 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order_ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity_Ordered 185950 non-null float64 3 Price_Each 185950 non-null float64 4 Order_Date 185950 non-null object 5 Purchase_Address 185950 non-null object 6 Date 185950 non-null datetime64[ns] 7 Day 185950 non-null object 8 Month 185950 non-null int64 9 DayOfWeek 185950 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(6) memory usage: 15.6+ MB
| Order_ID | Product | Quantity_Ordered | Price_Each | Order_Date | Purchase_Address | Date | Day | Month | DayOfWeek | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1.0 | 700.00 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 | 2019-01-22 | 22 | 1 | Tuesday |
| 1 | 141235 | Lightning Charging Cable | 1.0 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 | 2019-01-28 | 28 | 1 | Monday |
| 2 | 141236 | Wired Headphones | 2.0 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 | 2019-01-17 | 17 | 1 | Thursday |
| 3 | 141237 | 27in FHD Monitor | 1.0 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 | 2019-01-05 | 05 | 1 | Saturday |
| 4 | 141238 | Wired Headphones | 1.0 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 | 2019-01-25 | 25 | 1 | Friday |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 186845 | 319666 | Lightning Charging Cable | 1.0 | 14.95 | 12/11/19 20:58 | 14 Madison St, San Francisco, CA 94016 | 2019-12-11 | 11 | 12 | Wednesday |
| 186846 | 319667 | AA Batteries (4-pack) | 2.0 | 3.84 | 12/01/19 12:01 | 549 Willow St, Los Angeles, CA 90001 | 2019-12-01 | 01 | 12 | Sunday |
| 186847 | 319668 | Vareebadd Phone | 1.0 | 400.00 | 12/09/19 06:43 | 273 Wilson St, Seattle, WA 98101 | 2019-12-09 | 09 | 12 | Monday |
| 186848 | 319669 | Wired Headphones | 1.0 | 11.99 | 12/03/19 10:39 | 778 River St, Dallas, TX 75001 | 2019-12-03 | 03 | 12 | Tuesday |
| 186849 | 319670 | Bose SoundSport Headphones | 1.0 | 99.99 | 12/21/19 21:45 | 747 Chestnut St, Los Angeles, CA 90001 | 2019-12-21 | 21 | 12 | Saturday |
185950 rows × 10 columns
# Converting the month from the month number to the month name
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months_num = [1,2,3,4,5,6,7,8,9,10,11,12]
for n in months_num:
df_modified['Month'][df_modified['Month'] == n] = months[n-1]
df_modified
<ipython-input-76-e9540837e5e1>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| Order_ID | Product | Quantity_Ordered | Price_Each | Order_Date | Purchase_Address | Date | Day | Month | DayOfWeek | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1.0 | 700.00 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 | 2019-01-22 | 22 | Jan | Tuesday |
| 1 | 141235 | Lightning Charging Cable | 1.0 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 | 2019-01-28 | 28 | Jan | Monday |
| 2 | 141236 | Wired Headphones | 2.0 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 | 2019-01-17 | 17 | Jan | Thursday |
| 3 | 141237 | 27in FHD Monitor | 1.0 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 | 2019-01-05 | 05 | Jan | Saturday |
| 4 | 141238 | Wired Headphones | 1.0 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 | 2019-01-25 | 25 | Jan | Friday |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 186845 | 319666 | Lightning Charging Cable | 1.0 | 14.95 | 12/11/19 20:58 | 14 Madison St, San Francisco, CA 94016 | 2019-12-11 | 11 | Dec | Wednesday |
| 186846 | 319667 | AA Batteries (4-pack) | 2.0 | 3.84 | 12/01/19 12:01 | 549 Willow St, Los Angeles, CA 90001 | 2019-12-01 | 01 | Dec | Sunday |
| 186847 | 319668 | Vareebadd Phone | 1.0 | 400.00 | 12/09/19 06:43 | 273 Wilson St, Seattle, WA 98101 | 2019-12-09 | 09 | Dec | Monday |
| 186848 | 319669 | Wired Headphones | 1.0 | 11.99 | 12/03/19 10:39 | 778 River St, Dallas, TX 75001 | 2019-12-03 | 03 | Dec | Tuesday |
| 186849 | 319670 | Bose SoundSport Headphones | 1.0 | 99.99 | 12/21/19 21:45 | 747 Chestnut St, Los Angeles, CA 90001 | 2019-12-21 | 21 | Dec | Saturday |
185950 rows × 10 columns
# Checking
df_modified.Month.value_counts().sort_values()
Jan 9709 Sep 11621 Aug 11961 Feb 11975 Jun 13554 Jul 14293 Mar 15153 May 16566 Nov 17573 Apr 18279 Oct 20282 Dec 24984 Name: Month, dtype: int64
# Extracting the city and state from the address
df_modified['City'] = df_modified['Purchase_Address'].apply(lambda x : x.split(',')[1])
df_modified['State'] = df_modified['Purchase_Address'].apply(lambda x : x.split(',')[2])
df_modified['State'] = df_modified['State'].apply(lambda x : x.split()[0])
df_modified
| Order_ID | Product | Quantity_Ordered | Price_Each | Order_Date | Purchase_Address | Date | Day | Month | DayOfWeek | City | State | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1.0 | 700.00 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 | 2019-01-22 | 22 | Jan | Tuesday | Boston | MA |
| 1 | 141235 | Lightning Charging Cable | 1.0 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 | 2019-01-28 | 28 | Jan | Monday | Portland | OR |
| 2 | 141236 | Wired Headphones | 2.0 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 | 2019-01-17 | 17 | Jan | Thursday | San Francisco | CA |
| 3 | 141237 | 27in FHD Monitor | 1.0 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 | 2019-01-05 | 05 | Jan | Saturday | Los Angeles | CA |
| 4 | 141238 | Wired Headphones | 1.0 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 | 2019-01-25 | 25 | Jan | Friday | Austin | TX |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 186845 | 319666 | Lightning Charging Cable | 1.0 | 14.95 | 12/11/19 20:58 | 14 Madison St, San Francisco, CA 94016 | 2019-12-11 | 11 | Dec | Wednesday | San Francisco | CA |
| 186846 | 319667 | AA Batteries (4-pack) | 2.0 | 3.84 | 12/01/19 12:01 | 549 Willow St, Los Angeles, CA 90001 | 2019-12-01 | 01 | Dec | Sunday | Los Angeles | CA |
| 186847 | 319668 | Vareebadd Phone | 1.0 | 400.00 | 12/09/19 06:43 | 273 Wilson St, Seattle, WA 98101 | 2019-12-09 | 09 | Dec | Monday | Seattle | WA |
| 186848 | 319669 | Wired Headphones | 1.0 | 11.99 | 12/03/19 10:39 | 778 River St, Dallas, TX 75001 | 2019-12-03 | 03 | Dec | Tuesday | Dallas | TX |
| 186849 | 319670 | Bose SoundSport Headphones | 1.0 | 99.99 | 12/21/19 21:45 | 747 Chestnut St, Los Angeles, CA 90001 | 2019-12-21 | 21 | Dec | Saturday | Los Angeles | CA |
185950 rows × 12 columns
df_modified['State'].value_counts()
CA 74337 NY 24876 TX 24725 MA 19934 GA 14881 WA 14732 OR 10010 ME 2455 Name: State, dtype: int64
# Getting the total price for each order
df_modified['Total_Price'] = df_modified['Quantity_Ordered']* df_modified['Price_Each']
df_modified
| Order_ID | Product | Quantity_Ordered | Price_Each | Order_Date | Purchase_Address | Date | Day | Month | DayOfWeek | City | State | Total_Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1.0 | 700.00 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 | 2019-01-22 | 22 | Jan | Tuesday | Boston | MA | 700.00 |
| 1 | 141235 | Lightning Charging Cable | 1.0 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 | 2019-01-28 | 28 | Jan | Monday | Portland | OR | 14.95 |
| 2 | 141236 | Wired Headphones | 2.0 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 | 2019-01-17 | 17 | Jan | Thursday | San Francisco | CA | 23.98 |
| 3 | 141237 | 27in FHD Monitor | 1.0 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 | 2019-01-05 | 05 | Jan | Saturday | Los Angeles | CA | 149.99 |
| 4 | 141238 | Wired Headphones | 1.0 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 | 2019-01-25 | 25 | Jan | Friday | Austin | TX | 11.99 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 186845 | 319666 | Lightning Charging Cable | 1.0 | 14.95 | 12/11/19 20:58 | 14 Madison St, San Francisco, CA 94016 | 2019-12-11 | 11 | Dec | Wednesday | San Francisco | CA | 14.95 |
| 186846 | 319667 | AA Batteries (4-pack) | 2.0 | 3.84 | 12/01/19 12:01 | 549 Willow St, Los Angeles, CA 90001 | 2019-12-01 | 01 | Dec | Sunday | Los Angeles | CA | 7.68 |
| 186847 | 319668 | Vareebadd Phone | 1.0 | 400.00 | 12/09/19 06:43 | 273 Wilson St, Seattle, WA 98101 | 2019-12-09 | 09 | Dec | Monday | Seattle | WA | 400.00 |
| 186848 | 319669 | Wired Headphones | 1.0 | 11.99 | 12/03/19 10:39 | 778 River St, Dallas, TX 75001 | 2019-12-03 | 03 | Dec | Tuesday | Dallas | TX | 11.99 |
| 186849 | 319670 | Bose SoundSport Headphones | 1.0 | 99.99 | 12/21/19 21:45 | 747 Chestnut St, Los Angeles, CA 90001 | 2019-12-21 | 21 | Dec | Saturday | Los Angeles | CA | 99.99 |
185950 rows × 13 columns
# Dropping order_date and address columns
df_modified.drop(columns=['Order_Date', 'Purchase_Address', 'Date'], inplace=True)
df_modified
| Order_ID | Product | Quantity_Ordered | Price_Each | Day | Month | DayOfWeek | City | State | Total_Price | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 141234 | iPhone | 1.0 | 700.00 | 22 | Jan | Tuesday | Boston | MA | 700.00 |
| 1 | 141235 | Lightning Charging Cable | 1.0 | 14.95 | 28 | Jan | Monday | Portland | OR | 14.95 |
| 2 | 141236 | Wired Headphones | 2.0 | 11.99 | 17 | Jan | Thursday | San Francisco | CA | 23.98 |
| 3 | 141237 | 27in FHD Monitor | 1.0 | 149.99 | 05 | Jan | Saturday | Los Angeles | CA | 149.99 |
| 4 | 141238 | Wired Headphones | 1.0 | 11.99 | 25 | Jan | Friday | Austin | TX | 11.99 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 186845 | 319666 | Lightning Charging Cable | 1.0 | 14.95 | 11 | Dec | Wednesday | San Francisco | CA | 14.95 |
| 186846 | 319667 | AA Batteries (4-pack) | 2.0 | 3.84 | 01 | Dec | Sunday | Los Angeles | CA | 7.68 |
| 186847 | 319668 | Vareebadd Phone | 1.0 | 400.00 | 09 | Dec | Monday | Seattle | WA | 400.00 |
| 186848 | 319669 | Wired Headphones | 1.0 | 11.99 | 03 | Dec | Tuesday | Dallas | TX | 11.99 |
| 186849 | 319670 | Bose SoundSport Headphones | 1.0 | 99.99 | 21 | Dec | Saturday | Los Angeles | CA | 99.99 |
185950 rows × 10 columns
df_modified.groupby('Month')['Total_Price'].sum().sort_values(ascending=False).plot(kind = 'bar', figsize=(12,6))
plt.ylabel('Total sales by $Millions', fontsize=14)
plt.xlabel('Months', fontsize=14)
plt.xticks(rotation=0)
plt.title('Total Sales Per Month', fontsize=18)
print("The month with the largest sales is December")
The month with the largest sales is December
df_modified.groupby('Day')['Quantity_Ordered'].sum().plot(kind = 'bar', figsize=(12,6))
plt.ylabel('Number of orders', fontsize=14)
plt.xlabel('Day', fontsize=14)
plt.xticks(rotation=0)
plt.title('Total orders per day', fontsize=18)
print("There is no different between the days in orders numbers.")
There is no different between the days in orders numbers.
df_modified.groupby('DayOfWeek')['Quantity_Ordered'].sum().sort_values(ascending=False).plot(kind = 'bar', figsize=(12,6))
plt.ylabel('Number of orders', fontsize=14)
plt.xlabel('Weekday', fontsize=14)
plt.xticks(rotation=0)
plt.title('Total orders per weekday', fontsize=18)
print("There is no different between the weekdays.")
There is no different between the weekdays.
df_modified.Price_Each.hist(figsize=(12,6), grid = False);
plt.xlabel('Prices Ranges', fontsize=14);
plt.ylabel('Frequency', fontsize=14);
plt.title('The Range of prices people tend to buy', fontsize=18)
print("It looks like people like to buy cheaper products more than the expensive ones")
It looks like people like to buy cheaper products more than the expensive ones
fig = px.sunburst(df_modified, path =["State","City", "Product"], values = "Quantity_Ordered", width = 1000, height= 800, color_continuous_scale='Teal', color = "Quantity_Ordered")
print("California is the State with the biggest orders\nSan Francisco is the City with the biggest orders\nBatteries are the most requested product")
fig.show()
California is the State with the biggest orders San Francisco is the City with the biggest orders Batteries are the most requested product
df_modified.groupby('Product')['Total_Price'].sum().sort_values().plot(kind='barh',fontsize = 14, figsize=(8,8))
plt.ylabel('Product', fontsize=14)
plt.xlabel('Total Sales by $Millions', fontsize=14)
plt.title('Total Sales per Product', fontsize=18)
print("Despite the Batteries are the most product to be sold, \nThe Macbook Pro Laptop is the most profitable product and the Batteries is the least profitable Products.")
Despite the Batteries are the most product to be sold, The Macbook Pro Laptop is the most profitable product and the Batteries is the least profitable Products.
# Saving the final dataset to excel file
df_modified.to_excel('FinalSales.xlsx', index=False)